维度、列名、数据中的属性等:
# Dimensions of the data frame: number of rows, then number of columns
dim(iris)
## [1] 150 5
# Column names
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
# Compact overview of the object: type and leading values of every column
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# First rows of the data; tail(iris) shows the last rows instead
head(iris) # tail(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# All attributes attached to the object: names, row.names and class
attributes(iris)
## $names
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
## [137] 137 138 139 140 141 142 143 144 145 146 147 148 149 150
##
## $class
## [1] "data.frame"
# Draw 5 distinct row indices at random. seq_len() is used instead of 1:n so
# the index range stays valid (empty) even for a data frame with zero rows.
# No seed is set, so the indices differ from run to run.
idx <- sample(seq_len(nrow(iris)), 5)
idx
## [1] 37 115 12 139 36
# Select the sampled rows, keeping all columns
iris[idx, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 37 5.5 3.5 1.3 0.2 setosa
## 115 5.8 2.8 5.1 2.4 virginica
## 12 4.8 3.4 1.6 0.2 setosa
## 139 6.0 3.0 4.8 1.8 virginica
## 36 5.0 3.2 1.2 0.2 setosa
# Three equivalent ways to extract the first 10 values of one column
iris[1:10, "Sepal.Length"] # iris[1:10, 1] # iris$Sepal.Length[1:10]
## [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9
单个变量的数据分布情况、方差、直方图和密度图:
# Per-column summary: min, quartiles, median, mean and max for the numeric
# columns, and the count of each level for the Species factor
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Default quantiles (0%, 25%, 50%, 75%, 100%) of Sepal.Length
quantile(iris$Sepal.Length)
## 0% 25% 50% 75% 100%
## 4.3 5.1 5.8 6.4 7.9
# Quantiles at custom probabilities: 10%, 30% and 65%
quantile(iris$Sepal.Length, c(0.1, 0.3, 0.65))
## 10% 30% 65%
## 4.80 5.27 6.20
# Variance of Sepal.Length
var(iris$Sepal.Length)
## [1] 0.6856935
# Histogram of Sepal.Length
hist(iris$Sepal.Length)
# Density estimate of Sepal.Length with the default bandwidth
plot(density(iris$Sepal.Length))
# Same density estimate with the bandwidth set explicitly to 0.5
plot(density(iris$Sepal.Length, bw = .5))
对于 factor 变量,可以先用 table 求出各个种类的个数,再画出:
# Count the observations in each level of the Species factor once,
# then reuse the counts for printing and for both charts
species_counts <- table(iris$Species)
species_counts
##
## setosa versicolor virginica
## 50 50 50
# Pie chart of the counts
pie(species_counts)
# Bar chart of the counts
barplot(species_counts)
方差、协方差、相关系数(相关系数取值在 [-1, 1] 之间,越接近两端表示线性相关性越强),其中 D(X) 表示 X 的方差: \[ \begin{aligned} Cov(X,Y) &= E[(X-E[X])(Y-E[Y])] \\ Cor(X,Y) &= \frac{Cov(X,Y)}{\sqrt{D(X)}\sqrt{D(Y)}} \end{aligned} \]
library(ggplot2)
# Covariance between sepal length and petal length
with(iris, cov(Sepal.Length, Petal.Length))
## [1] 1.274315
# Scatter plot of the two variables with a smoothed trend line on top
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point() +
  geom_smooth()
# Covariance matrix of the four numeric columns
cov(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.6856935 -0.0424340 1.2743154 0.5162707
## Sepal.Width -0.0424340 0.1899794 -0.3296564 -0.1216394
## Petal.Length 1.2743154 -0.3296564 3.1162779 1.2956094
## Petal.Width 0.5162707 -0.1216394 1.2956094 0.5810063
# Correlation coefficient of the same pair of variables
with(iris, cor(Sepal.Length, Petal.Length))
## [1] 0.8717538
# Correlation matrix of the four numeric columns
cor(iris[1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
使用 aggregate 对数据进行分组汇总:
# Split Sepal.Length by Species and apply summary() to each group
aggregate(Sepal.Length ~ Species, data = iris, FUN = summary)
## Species Sepal.Length.Min. Sepal.Length.1st Qu. Sepal.Length.Median
## 1 setosa 4.300 4.800 5.000
## 2 versicolor 4.900 5.600 5.900
## 3 virginica 4.900 6.225 6.500
## Sepal.Length.Mean Sepal.Length.3rd Qu. Sepal.Length.Max.
## 1 5.006 5.200 5.800
## 2 5.936 6.300 7.000
## 3 6.588 6.900 7.900
# Box plot of Sepal.Length for each species, with explicit axis titles
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
  geom_boxplot() +
  labs(x = "Species", y = "Sepal.Length")
从上图可以看出,可以使用 Sepal.Length 来区分 Species。
# Sepal length vs. sepal width, with both colour and point shape mapped to Species
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) +
  geom_point(mapping = aes(shape = Species))
# When many points coincide, add a little random noise so they can be told apart
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_jitter(shape = 1)
# Scatter-plot matrix of every pair of columns
pairs(iris)
library(scatterplot3d)
# 3-D scatter plot of petal width, sepal length and sepal width
scatterplot3d(iris$Petal.Width, iris$Sepal.Length, iris$Sepal.Width)
# Alternative using the rgl package, left disabled:
# library(rgl)
# plot3d(iris$Petal.Width, iris$Sepal.Length, iris$Sepal.Width)
# Pairwise distances between rows, computed over the four numeric columns,
# converted to a full matrix and drawn as a heat map
distMatrix <- as.matrix(dist(iris[,1:4]))
heatmap(distMatrix)
library(lattice)
# Level plot of Petal.Width over the Sepal.Length x Sepal.Width plane,
# using 9 cuts and a reversed grey palette
levelplot(Petal.Width ~ Sepal.Length * Sepal.Width, iris, cuts = 9,
          col.regions = rev(grey.colors(10)))
# Filled contour plot of the built-in volcano matrix with contour lines overlaid.
# The palette argument is spelled out in full (color.palette) rather than relying
# on partial matching of `color`, and TRUE replaces T, which is an ordinary
# variable that can be reassigned.
filled.contour(volcano, color.palette = terrain.colors, asp = 1,
               plot.axes = contour(volcano, add = TRUE))
# Perspective (3-D surface) view of the same volcano matrix
persp(volcano, theta = 25, phi = 30, expand = 0.5, col = "lightblue")
# Parallel-coordinates plot of the four numeric columns, one panel per species
parallelplot(~ iris[1:4] | Species, data = iris)
# Sepal scatter plot with one facet row per species; written with ggplot() and
# facet_grid() because qplot() is deprecated in ggplot2 >= 3.4.0
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_point() +
  facet_grid(Species ~ .)